In [53]:
# Install Required Libraries (Uncomment if needed)
# !pip install xgboost
# !pip install prophet
# Standard Libraries
import numpy as np
import pandas as pd
# Data Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import MarkerCluster
import plotly.graph_objects as go
# Geospatial Libraries
import geopandas as gpd
# Statistical and Scientific Libraries
from scipy import stats
from scipy.stats import zscore, boxcox
import scipy.cluster.hierarchy as sch # For clustering dendrograms
# Machine Learning - Preprocessing
from sklearn.preprocessing import StandardScaler, PowerTransformer, LabelEncoder
from sklearn.impute import SimpleImputer
# Enable experimental IterativeImputer before importing it
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.feature_extraction.text import TfidfVectorizer
# Machine Learning - Model Selection
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
# Machine Learning - Supervised Learning
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier, XGBRegressor
# Machine Learning - Unsupervised Learning
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.decomposition import PCA
# Machine Learning - Time Series
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
# Machine Learning - Evaluation Metrics
from sklearn.metrics import (
accuracy_score, f1_score, roc_auc_score, classification_report, # Classification
mean_absolute_error, mean_squared_error, r2_score # Regression
)
# Natural Language Processing (NLP)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Ensure NLTK resources are downloaded
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
[nltk_data] Downloading package stopwords to [nltk_data] /Users/bcoeur34/nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to /Users/bcoeur34/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package wordnet to [nltk_data] /Users/bcoeur34/nltk_data... [nltk_data] Package wordnet is already up-to-date!
Out[53]:
True
In [ ]:
Data Pre-processing / Explore a DataFrame¶
In [55]:
# Load the NBA 2024 dataset from a local CSV file into a pandas DataFrame
df = pd.read_csv("/Users/bcoeur34/Desktop/Python /nba_2024.csv")
# Display the first five rows of the DataFrame to preview the data
df.head()
Out[55]:
| Rk | Player | Age | Team | Pos | G | GS | MP | FG | FGA | ... | DRB | TRB | AST | STL | BLK | TOV | PF | PTS | Awards | Player-additional | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Joel Embiid | 29 | PHI | C | 39 | 39 | 33.6 | 11.5 | 21.8 | ... | 8.6 | 11.0 | 5.6 | 1.2 | 1.7 | 3.8 | 2.9 | 34.7 | AS | embiijo01 |
| 1 | 2 | Luka Dončić | 24 | DAL | PG | 70 | 70 | 37.5 | 11.5 | 23.6 | ... | 8.4 | 9.2 | 9.8 | 1.4 | 0.5 | 4.0 | 2.1 | 33.9 | MVP-3CPOY-6ASNBA1 | doncilu01 |
| 2 | 3 | Giannis Antetokounmpo | 29 | MIL | PF | 73 | 73 | 35.2 | 11.5 | 18.8 | ... | 8.8 | 11.5 | 6.5 | 1.2 | 1.1 | 3.4 | 2.9 | 30.4 | MVP-4DPOY-9CPOY-12ASNBA1 | antetgi01 |
| 3 | 4 | Shai Gilgeous-Alexander | 25 | OKC | PG | 75 | 75 | 34.0 | 10.6 | 19.8 | ... | 4.7 | 5.5 | 6.2 | 2.0 | 0.9 | 2.2 | 2.5 | 30.1 | MVP-2DPOY-7CPOY-3ASNBA1 | gilgesh01 |
| 4 | 5 | Jalen Brunson | 27 | NYK | PG | 77 | 77 | 35.4 | 10.3 | 21.4 | ... | 3.1 | 3.6 | 6.7 | 0.9 | 0.2 | 2.4 | 1.9 | 28.7 | MVP-5CPOY-5ASNBA2 | brunsja01 |
5 rows × 32 columns
In [56]:
# Print summary of the DataFrame
# This displays information about the DataFrame including:
# - Total number of rows and columns
# - Column names and their data types
# - Non-null value counts for each column
# - Memory usage of the DataFrame
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 572 entries, 0 to 571 Data columns (total 32 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Rk 572 non-null int64 1 Player 572 non-null object 2 Age 572 non-null int64 3 Team 572 non-null object 4 Pos 572 non-null object 5 G 572 non-null int64 6 GS 572 non-null int64 7 MP 572 non-null float64 8 FG 572 non-null float64 9 FGA 572 non-null float64 10 FG% 568 non-null float64 11 3P 572 non-null float64 12 3PA 572 non-null float64 13 3P% 540 non-null float64 14 2P 572 non-null float64 15 2PA 572 non-null float64 16 2P% 567 non-null float64 17 eFG% 568 non-null float64 18 FT 572 non-null float64 19 FTA 572 non-null float64 20 FT% 537 non-null float64 21 ORB 572 non-null float64 22 DRB 572 non-null float64 23 TRB 572 non-null float64 24 AST 572 non-null float64 25 STL 572 non-null float64 26 BLK 572 non-null float64 27 TOV 572 non-null float64 28 PF 572 non-null float64 29 PTS 572 non-null float64 30 Awards 55 non-null object 31 Player-additional 572 non-null object dtypes: float64(23), int64(4), object(5) memory usage: 143.1+ KB
In [57]:
# Print a summary of descriptive statistics for the dataframe
# This includes count, mean, std, min, 25%, 50%, 75%, max for numeric columns
df.describe()
Out[57]:
| Rk | Age | G | GS | MP | FG | FGA | FG% | 3P | 3PA | ... | FT% | ORB | DRB | TRB | AST | STL | BLK | TOV | PF | PTS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 572.000000 | 572.000000 | 572.000000 | 572.000000 | 572.000000 | 572.00000 | 572.000000 | 568.000000 | 572.000000 | 572.000000 | ... | 537.000000 | 572.000000 | 572.000000 | 572.000000 | 572.000000 | 572.000000 | 572.000000 | 572.000000 | 572.000000 | 572.000000 |
| mean | 286.500000 | 25.743007 | 46.153846 | 21.503497 | 18.650699 | 3.12028 | 6.683392 | 0.452750 | 0.940909 | 2.646329 | ... | 0.747110 | 0.857692 | 2.522902 | 3.373077 | 2.001399 | 0.591259 | 0.402797 | 0.984615 | 1.490909 | 8.423252 |
| std | 165.266452 | 4.225107 | 25.535953 | 26.906890 | 9.906921 | 2.45884 | 4.991255 | 0.107259 | 0.879363 | 2.241818 | ... | 0.157051 | 0.746194 | 1.831697 | 2.425364 | 1.873410 | 0.390106 | 0.413213 | 0.795813 | 0.787215 | 6.791411 |
| min | 1.000000 | 19.000000 | 1.000000 | 0.000000 | 0.500000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 143.750000 | 23.000000 | 22.000000 | 0.000000 | 10.800000 | 1.30000 | 3.000000 | 0.409750 | 0.200000 | 0.900000 | ... | 0.686000 | 0.300000 | 1.200000 | 1.600000 | 0.700000 | 0.300000 | 0.100000 | 0.400000 | 1.000000 | 3.400000 |
| 50% | 286.500000 | 25.000000 | 51.000000 | 7.000000 | 17.350000 | 2.40000 | 5.100000 | 0.452500 | 0.700000 | 2.100000 | ... | 0.772000 | 0.700000 | 2.200000 | 3.000000 | 1.300000 | 0.550000 | 0.300000 | 0.700000 | 1.500000 | 6.400000 |
| 75% | 429.250000 | 28.000000 | 69.000000 | 41.250000 | 27.000000 | 4.50000 | 9.100000 | 0.500000 | 1.500000 | 3.825000 | ... | 0.833000 | 1.100000 | 3.400000 | 4.400000 | 2.725000 | 0.825000 | 0.600000 | 1.400000 | 2.100000 | 11.700000 |
| max | 572.000000 | 39.000000 | 84.000000 | 82.000000 | 37.800000 | 11.50000 | 23.600000 | 0.747000 | 4.800000 | 11.800000 | ... | 1.000000 | 4.600000 | 10.100000 | 13.700000 | 10.900000 | 2.100000 | 3.600000 | 4.400000 | 3.600000 | 34.700000 |
8 rows × 27 columns
In [58]:
# Check for the number of null values in each column
# .isnull() creates a boolean mask where True indicates null values
# .sum() counts the number of True values (nulls) in each column
# .head() shows only the first few results
df.isnull().sum().head()
Out[58]:
Rk 0 Player 0 Age 0 Team 0 Pos 0 dtype: int64
In [59]:
# Count the number of duplicate rows in the dataframe
# Returns the sum of True values (where rows are duplicates)
df.duplicated().sum()
Out[59]:
0
In [60]:
# Fill all null values in the dataframe with 0
# The inplace=True parameter modifies the original dataframe instead of returning a new one
df.fillna(0, inplace=True)
In [61]:
# Convert the Age column from float to integer type
# This ensures age values are whole numbers without decimal points
df["Age"] = df["Age"].astype(int)
In [62]:
# Rename the DataFrame columns to more descriptive names
# This improves readability and makes the data more self-explanatory
# The dictionary maps original column names to new, more descriptive names
# inplace=True means the DataFrame is modified directly without creating a copy
# df.rename(columns={
'PName': 'Player_Name', # Player's full name
'POS': 'Position', # Player's position on the team
'Team': 'Team_Abbreviation', # NBA team abbreviation
'Age': 'Age', # Player's age
'GP': 'Games_Played', # Number of games played
'W': 'Wins', # Number of wins
'L': 'Losses', # Number of losses
'Min': 'Minutes_Played', # Total minutes played
'PTS': 'Total_Points', # Total points scored
'FGM': 'Field_Goals_Made', # Field goals made
'FGA': 'Field_Goals_Attempted', # Field goals attempted
'FG%': 'Field_Goal_Percentage', # Field goal percentage
'3PM': 'Three_Point_FG_Made', # Three-point field goals made
'3PA': 'Three_Point_FG_Attempted', # Three-point field goals attempted
'3P%': 'Three_Point_FG_Percentage', # Three-point field goal percentage
'FTM': 'Free_Throws_Made', # Free throws made
'FTA': 'Free_Throws_Attempted', # Free throws attempted
'FT%': 'Free_Throw_Percentage', # Free throw percentage
'OREB': 'Offensive_Rebounds', # Offensive rebounds
'DREB': 'Defensive_Rebounds', # Defensive rebounds
'REB': 'Total_Rebounds', # Total rebounds (OREB + DREB)
'AST': 'Assists', # Assists
'TOV': 'Turnovers', # Turnovers
'STL': 'Steals', # Steals
'BLK': 'Blocks', # Blocks
'PF': 'Personal_Fouls', # Personal fouls
'FP': 'NBA_Fantasy_Points', # NBA fantasy points
'DD2': 'Double_Doubles', # Double-doubles achieved
'TD3': 'Triple_Doubles', # Triple-doubles achieved
'+/-': 'Plus_Minus' # Plus-minus statistic
}, inplace=True)
Cell In[62], line 6 'PName': 'Player_Name', # Player's full name ^ IndentationError: unexpected indent
In [102]:
# Create a histogram using Plotly Express to visualize the distribution of player positions
# - df: The dataframe containing player data
# - x='Pos': Plot the 'Pos' (Position) column on the x-axis
# - color_discrete_sequence: Use the Vivid color palette for better visualization
# - title: Set the chart title to 'Players position value counts'
# - template: Use the dark theme for the plot background and styling
import plotly.express as px
px.histogram(df, x='Pos', color_discrete_sequence=px.colors.qualitative.Vivid,
title='Players position value counts',template='plotly_dark')
In [104]:
# Group data by position and calculate the mean points per game
# This creates a new DataFrame that shows the average points scored by players in each position
# First, we group the data by the 'Pos' (position) column
# Then we calculate the mean of 'PTS' (points) for each position group
# reset_index() converts the grouped result back to a regular DataFrame with 'Pos' as a column
position_stats = df.groupby(['Pos']).agg({'PTS': 'mean'}).reset_index()
position_stats
Out[104]:
| Pos | PTS | |
|---|---|---|
| 0 | C | 8.383158 |
| 1 | PF | 8.743860 |
| 2 | PG | 9.473333 |
| 3 | SF | 7.568293 |
| 4 | SG | 8.142963 |
In [106]:
# Create a bar chart showing average points per game for each position
fig = px.bar(position_stats, x='Pos', y='PTS',
color='PTS', # Color based on average PTS
color_continuous_scale='Bluered', # Beautiful color scale from blue to red
title='Average Points Per Game by Position', # Chart title
labels={'Pos': 'Player Position', 'PTS': 'Avg Points Per Game'}, # Axis labels
template='plotly_dark') # Dark theme for better visualization
# Display the interactive plotly chart in the notebook
# This renders the chart and allows for interactive features like hover tooltips
fig.show()
In [108]:
# Sort the dataframe by Total Rebounds (TRB) in descending order
# Select only the 'Player' and 'TRB' columns
# Display the top 5 players with the most rebounds
print(df.sort_values(by='TRB', ascending=False)[['Player', 'TRB']].head())
Player TRB 57 Domantas Sabonis 13.7 105 Rudy Gobert 12.9 18 Anthony Davis 12.6 11 Nikola Jokić 12.4 110 Jalen Duren 11.6
In [110]:
# Calculate Defensive Performance by summing blocks and steals
df['Defensive_Performance'] = df['BLK'] + df['STL']
# Get the top 10 players with the highest defensive performance
best_defending_players = df.sort_values(by='Defensive_Performance', ascending=False).head(10)
# Define a color list for each bar to make the visualization more distinguishable
colors = ['red', 'blue', 'green', 'yellow', 'purple', 'orange', 'cyan', 'pink', 'lime', 'magenta']
# Create a bar chart using Plotly
fig_defending = go.Figure()
# Add the bar trace with player names on x-axis and defensive performance on y-axis
fig_defending.add_trace(go.Bar(
x=best_defending_players['Player'],
y=best_defending_players['Defensive_Performance'],
marker=dict(color=colors))) # Apply the colors to the bars
# Configure the layout of the chart with title, axis labels and dimensions
fig_defending.update_layout(
title='Top 10 Best Defending Players',
xaxis_title='Player Name',
yaxis_title='Defensive Performance (Combined Blocks and Steals)',
height=500,
width=1000,
template='plotly_dark' # Use dark theme for better visualization
)
In [111]:
# Group the dataframe by position ('Pos'), calculate the mean points ('PTS') for each position
# Reset the index to make 'Pos' a column instead of an index
# Sort the results in descending order based on average points
avg_pts_by_pos = df.groupby('Pos')['PTS'].mean().reset_index().sort_values(by='PTS', ascending=False)
# Display the top rows of the resulting dataframe
avg_pts_by_pos.head()
Out[111]:
| Pos | PTS | |
|---|---|---|
| 2 | PG | 9.473333 |
| 1 | PF | 8.743860 |
| 0 | C | 8.383158 |
| 4 | SG | 8.142963 |
| 3 | SF | 7.568293 |
In [112]:
# Group data by team, sum the points for each team, convert to DataFrame
# Then sort teams by total points in descending order
# Display the top 5 teams with highest total points
total_pts_by_team = df.groupby('Team')['PTS'].sum().reset_index().sort_values(by='PTS', ascending=False)
total_pts_by_team.head()
Out[112]:
| Team | PTS | |
|---|---|---|
| 0 | 2TM | 513.5 |
| 16 | MEM | 217.5 |
| 26 | POR | 169.1 |
| 2 | ATL | 167.5 |
| 5 | CHI | 161.3 |
In [113]:
# Calculate scoring efficiency for each player by dividing points by field goal attempts
df['Scoring_Efficiency'] = df['PTS'] / df['FGA']
# Find the player with the highest scoring efficiency
# idxmax() returns the index of the maximum value in the Scoring_Efficiency column
highest_eff = df.loc[df['Scoring_Efficiency'].idxmax()]
# Print the player's name, points, field goal attempts, and scoring efficiency
print(highest_eff[['Player', 'PTS', 'FGA', 'Scoring_Efficiency']])
Player Quenton Jackson PTS 0.7 FGA 0.3 Scoring_Efficiency 2.333333 Name: 552, dtype: object
In [114]:
# Which team has the best average assist-to-turnover ratio (AST/TO)?
# Calculate the assist-to-turnover ratio for each row, adding a small epsilon (1e-5) to avoid division by zero
df['AST_TO_Ratio'] = df['AST'] / (df['TOV'] + 1e-5)
# Group data by team, calculate the mean AST_TO_Ratio for each team, convert to DataFrame, and sort in descending order
team_ast_to = df.groupby('Team')['AST_TO_Ratio'].mean().reset_index().sort_values(by='AST_TO_Ratio', ascending=False)
# Display the top teams with the best assist-to-turnover ratios
team_ast_to.head()
Out[114]:
| Team | AST_TO_Ratio | |
|---|---|---|
| 29 | TOR | 17649.184131 |
| 22 | OKC | 13335.311827 |
| 24 | PHI | 10001.880022 |
| 4 | BRK | 10001.680836 |
| 13 | IND | 6251.840948 |
In [115]:
# Analyzing correlation between key basketball statistics
# Define the columns we want to analyze: Points, Rebounds, Assists, and Minutes Played
cols = ['PTS', 'TRB', 'AST', 'MP']
# Calculate the correlation matrix between these statistics
# This shows how strongly each pair of variables is related
correlation_matrix = df[cols].corr()
# Display the correlation matrix
# Values close to 1 indicate strong positive correlation
# Values close to -1 indicate strong negative correlation
# Values close to 0 indicate little to no correlation
print(correlation_matrix)
PTS TRB AST MP PTS 1.000000 0.664955 0.781182 0.892276 TRB 0.664955 1.000000 0.451020 0.718441 AST 0.781182 0.451020 1.000000 0.748452 MP 0.892276 0.718441 0.748452 1.000000
In [116]:
# Create a heatmap visualization of the correlation matrix
# - annot=True displays the correlation values in each cell
# - cmap='coolwarm' sets the color scheme (blue for negative, red for positive correlations)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
# Add a descriptive title to the heatmap
plt.title("Correlation Between Key Stats")
# Display the heatmap
plt.show()
In [117]:
# Calculate defensive contribution by adding blocks and steals
df['DEF'] = df['BLK'] + df['STL']
# Group data by position, calculate mean defensive stats, and sort in descending order
defense_by_pos = df.groupby('Pos')['DEF'].mean().reset_index().sort_values(by='DEF', ascending=False)
# Display the top positions with highest defensive stats
defense_by_pos.head()
Out[117]:
| Pos | DEF | |
|---|---|---|
| 0 | C | 1.335789 |
| 1 | PF | 1.053509 |
| 2 | PG | 0.954286 |
| 3 | SF | 0.896748 |
| 4 | SG | 0.822963 |
In [118]:
# Who are the most all-around players (high across many stats)?
# Create a combined score by summing key statistical categories
# Calculate an All_Around_Score by adding points, rebounds, assists, steals and blocks
df['All_Around_Score'] = df[['PTS', 'TRB', 'AST', 'STL', 'BLK']].sum(axis=1)
# Sort players by their All_Around_Score in descending order and select only relevant columns
all_around_players = df.sort_values(by='All_Around_Score', ascending=False)[['Player', 'All_Around_Score']]
# Display the top 10 most all-around players
all_around_players.head(10)
Out[118]:
| Player | All_Around_Score | |
|---|---|---|
| 1 | Luka Dončić | 54.8 |
| 0 | Joel Embiid | 54.2 |
| 2 | Giannis Antetokounmpo | 50.7 |
| 11 | Nikola Jokić | 50.1 |
| 3 | Shai Gilgeous-Alexander | 44.7 |
| 18 | Anthony Davis | 44.3 |
| 14 | LeBron James | 43.1 |
| 57 | Domantas Sabonis | 42.8 |
| 7 | Jayson Tatum | 41.5 |
| 6 | Kevin Durant | 40.8 |
In [119]:
# Filter the dataframe to only include rows where the "Player" column equals "LeBron James"
# This creates a new dataframe containing only LeBron James' statistics
lebron_stats = df[df["Player"] == "LeBron James"]
# Display the filtered dataframe with LeBron's stats
lebron_stats
Out[119]:
| Rk | Player | Age | Team | Pos | G | GS | MP | FG | FGA | ... | TOV | PF | PTS | Awards | Player-additional | Defensive_Performance | Scoring_Efficiency | AST_TO_Ratio | DEF | All_Around_Score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 14 | 15 | LeBron James | 39 | LAL | PF | 71 | 71 | 35.3 | 9.6 | 17.9 | ... | 3.5 | 1.1 | 25.7 | CPOY-10ASNBA3 | jamesle01 | 1.8 | 1.435754 | 2.371422 | 1.8 | 43.1 |
1 rows × 37 columns
In [120]:
# Example efficiency calculation (SIMPLIFIED)
# Calculate player efficiency by adding positive stats (points, rebounds, assists, steals, blocks)
# and subtracting negative stats (missed field goals and missed free throws)
df["Efficiency"] = (
(df["PTS"] + df["TRB"] + df["AST"] + df["STL"] + df["BLK"]) - # Sum of positive contributions
(df["FGA"] - df["FG"]) - # Missed field goals (attempts minus makes)
(df["FTA"] - df["FT"]) # Missed free throws (attempts minus makes)
)
# Show LeBron's efficiency rating
# Filter the dataframe for LeBron James and extract his efficiency value
lebron_eff = df[df["Player"] == "LeBron James"]["Efficiency"]
print(f"Lebron's Efficiency Rating: {lebron_eff.values[0]}") # Print the first value from the filtered series
Lebron's Efficiency Rating: 33.4
In [121]:
# Select top 10 NBA players based on efficiency rating
top_players = df.nlargest(10, "Efficiency")
# Create a bar chart with specified dimensions
plt.figure(figsize=(10, 5))
# Plot bars with player names on x-axis and efficiency values on y-axis
plt.bar(top_players["Player"], top_players["Efficiency"], color="purple")
# Add x-axis label
plt.xlabel("Player")
# Add y-axis label
plt.ylabel("Efficiency Rating")
# Add chart title
plt.title("Top 10 NBA Players By Efficiency")
# Rotate x-axis labels for better readability
plt.xticks(rotation=45)
# Display the chart
plt.show()
Data Visualization¶
In [123]:
# Top 10 scorers
top_scorers = df.sort_values("PTS", ascending=False).head(10) # Sort dataframe by points in descending order and get top 10 players
sns.barplot(x="PTS", y="Player", data=top_scorers) # Create a horizontal bar plot showing points for each player
plt.title("Top 10 Scorers") # Add title to the plot
plt.show() # Display the plot
# Correlation heatmap
plt.figure(figsize=(10, 8)) # Create a new figure with specified size
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm') # Create heatmap of correlation matrix with values shown and using coolwarm color palette
plt.title("Correlation Matrix") # Add title to the heatmap
plt.show() # Display the heatmap
Supervised Learning (Classification)¶
In [125]:
# Can we predict if a player scores more than 20 PPG?
from sklearn.model_selection import train_test_split # Import function to split data into training and testing sets
from sklearn.ensemble import RandomForestClassifier # Import Random Forest classifier algorithm
from sklearn.metrics import classification_report # Import tool to evaluate classification performance
# Create binary target variable: 1 if player scores >= 20 points per game, 0 otherwise
df['HighScorer'] = (df['PTS'] >= 20).astype(int)
# Select relevant features that might help predict high scorers
features = ["AST", "TRB", "FG%", "3P%", "FT%", "MP"] # Assists, Rebounds, Field Goal %, 3-Point %, Free Throw %, Minutes Played
X = df[features] # Feature matrix
y = df['HighScorer'] # Target vector
# Split data into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Initialize Random Forest classifier with default parameters
clf = RandomForestClassifier()
clf.fit(X_train, y_train) # Train the model on training data
# Make predictions on test data and evaluate performance
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred)) # Print precision, recall, f1-score and support metrics
precision recall f1-score support
0 0.97 0.96 0.97 158
1 0.60 0.64 0.62 14
accuracy 0.94 172
macro avg 0.78 0.80 0.79 172
weighted avg 0.94 0.94 0.94 172
Unsupervised Learning (Clustering)¶
In [127]:
Can we group players into performance metrics?
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# Select relevant performance metrics for clustering analysis
# PTS = Points, AST = Assists, TRB = Total Rebounds, STL = Steals, BLK = Blocks, TOV = Turnovers
X_cluster = df[["PTS", "AST", "TRB", "STL", "BLK", "TOV"]]
# Scale the data to have mean=0 and variance=1
# This is important because KMeans is sensitive to the scale of the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster)
# Apply KMeans clustering algorithm with 3 clusters
# random_state ensures reproducibility of results
kmeans = KMeans(n_clusters=3, random_state=42)
df["Cluster"] = kmeans.fit_predict(X_scaled) # Assign cluster labels to each player
# Create a scatter plot to visualize the clusters
# Using points and assists as dimensions, with color indicating cluster membership
sns.scatterplot(data=df, x="PTS", y="AST", hue="Cluster", palette="Set2")
plt.title("Player Clusters Based on Performance")
plt.show()
Object `metrics` not found.
Model Selection and Evaluation¶
In [129]:
# Compare classifiers for high scorer prediction:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
# Dictionary of classifier models to evaluate
models = {
"RandomForest": RandomForestClassifier(), # Random Forest classifier (imported earlier)
"LogisticRegression": LogisticRegression(max_iter=1000), # Logistic Regression with increased iterations
"SVM": SVC(), # Support Vector Machine with default parameters
"GradientBoosting": GradientBoostingClassifier() # Gradient Boosting classifier
}
# Evaluate each model using 5-fold cross-validation
for name, model in models.items():
# Calculate accuracy scores across 5 folds
score = cross_val_score(model, X, y, cv=5, scoring="accuracy")
# Print the mean accuracy for each model
print(f"{name} Mean Accuracy: {score.mean():.2f}")
RandomForest Mean Accuracy: 0.90 LogisticRegression Mean Accuracy: 0.90 SVM Mean Accuracy: 0.89 GradientBoosting Mean Accuracy: 0.88
Technique to Handle It: PCA (Principal Component Analysis)¶
PCA reduces dimensionality by converting correlated features into uncorrelated principal components.
In [131]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Step 1: Select relevant numeric features for player performance analysis
# These features represent key basketball statistics like points, assists, rebounds, etc.
features = ["PTS", "AST", "TRB", "STL", "BLK", "TOV", "FG%", "3P%", "FT%", "MP"]
X = df[features].fillna(0) # Fill missing values with zeros to handle incomplete data
# Step 2: Standardize the data
# This ensures all features contribute equally to the analysis regardless of their scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Step 3: Apply Principal Component Analysis
# PCA reduces dimensionality while preserving as much variance as possible
# Using all features as components initially to analyze their importance
pca = PCA(n_components=len(features))
X_pca = pca.fit_transform(X_scaled)
# Step 4: Calculate cumulative explained variance
# This shows how much information is retained with each additional component
explained_var = np.cumsum(pca.explained_variance_ratio_)
# Plot cumulative explained variance to visualize component importance
# This helps determine how many components to keep for further analysis
plt.figure(figsize=(9, 5))
plt.plot(range(1, len(features)+1), explained_var, marker='o', linestyle='--')
plt.title("Explained Variance by Principal Components")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.grid(True)
plt.show()
What is a Random Forest?
A Random Forest is:
An ensemble of Decision Trees It uses bagging (Bootstrap Aggregation) to train multiple trees on different subsets of data Each tree votes, and the majority vote wins (for classification) Handles overfitting better than a single decision tree
In [133]:
# Predict if a player is a "High Scorer" (PTS >= 20)
from sklearn.model_selection import train_test_split # Import function to split data into training and testing sets
from sklearn.ensemble import RandomForestClassifier # Import Random Forest classifier algorithm
from sklearn.metrics import classification_report, accuracy_score # Import evaluation metrics
# Create a target column: HighScorer (1 if PTS >= 20)
# This creates a binary classification target - players who score 20+ points are labeled as 1
df['HighScorer'] = (df['PTS'] >= 20).astype(int)
# Choose numerical features (you can customize)
# These statistics will be used to predict if a player is a high scorer
features = ['AST', 'TRB', 'STL', 'BLK', 'TOV', 'MP', 'FG%', '3P%', 'FT%']
df = df.dropna(subset=features) # Remove any rows with missing values in our selected features
X = df[features] # Feature matrix
y = df['HighScorer'] # Target vector
# Split into training/testing sets
# 70% of data used for training, 30% for testing, with fixed random state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train the Random Forest
# Create a Random Forest classifier with 100 decision trees
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train) # Train the model on our training data
# Predict
# Use the trained model to predict high scorers in the test set
y_pred = rf.predict(X_test)
# Evaluation
# Calculate and display the accuracy of our predictions
print("Accuracy:", accuracy_score(y_test, y_pred))
# Display detailed classification metrics (precision, recall, f1-score)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Accuracy: 0.9476744186046512
Classification Report:
precision recall f1-score support
0 0.97 0.97 0.97 158
1 0.67 0.71 0.69 14
accuracy 0.95 172
macro avg 0.82 0.84 0.83 172
weighted avg 0.95 0.95 0.95 172
Feature Importance¶
In [135]:
# Calculate feature importances from the trained Random Forest model
data = df[['PTS', 'AST', 'TRB', 'MP']].dropna()
# Create binary target - players are classified as high scorers if their points are above the median
data['HighScorer'] = (data['PTS'] > data['PTS'].median()).astype(int)
# Define features (assists, rebounds, minutes played) and target variable (high scorer classification)
X = data[['AST', 'TRB', 'MP']]
y = data['HighScorer']
# Standardize features to have mean=0 and variance=1 for better model performance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Split data into training (80%) and testing (20%) sets with fixed random state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# Initialize and train a Random Forest Classifier with fixed random state
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
# Extract feature importance scores from the trained model
importances = rf.feature_importances_
feature_names = X.columns
# Visualize feature importances using a bar plot
plt.figure(figsize=(6, 4))
sns.barplot(x=importances, y=feature_names)
plt.title('Feature Importances from Random Forest')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
In [ ]:
What is a Decision Tree?
A Decision Tree:
Is a flowchart-like structure Splits the data based on feature thresholds Is interpretable, fast, and good for both classification and regression
In [137]:
# Use Case: Classify Players as "High Scorer" (PTS ≥ 20)
from sklearn.model_selection import train_test_split # Import function to split data into training and testing sets
from sklearn.tree import DecisionTreeClassifier, plot_tree # Import decision tree classifier and visualization tool
from sklearn.metrics import classification_report, accuracy_score # Import evaluation metrics
# Create binary target variable: 1 if player scores 20+ points, 0 otherwise
df['HighScorer'] = (df['PTS'] >= 20).astype(int)
# Select features that might predict scoring ability
features = ['AST', 'TRB', 'STL', 'BLK', 'TOV', 'MP', 'FG%', '3P%', 'FT%']
df = df.dropna(subset=features) # Remove rows with missing values in any feature
X = df[features] # Feature matrix
y = df['HighScorer'] # Target vector
# Split data: 70% for training, 30% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Create decision tree with limited depth to prevent overfitting
tree = DecisionTreeClassifier(max_depth=4, random_state=42)
tree.fit(X_train, y_train) # Train the model on training data
# Make predictions on test data
y_pred = tree.predict(X_test)
# Evaluate model performance
print("Accuracy:", accuracy_score(y_test, y_pred)) # Percentage of correct predictions
print("\nClassification Report:\n", classification_report(y_test, y_pred)) # Detailed metrics by class
Accuracy: 0.9127906976744186
Classification Report:
precision recall f1-score support
0 0.97 0.93 0.95 158
1 0.48 0.71 0.57 14
accuracy 0.91 172
macro avg 0.72 0.82 0.76 172
weighted avg 0.93 0.91 0.92 172
In [138]:
# Visualizing the Decision Tree model
# Create a figure with specified dimensions (width=20, height=10 inches)
plt.figure(figsize=(20, 10))
# Plot the decision tree with feature names and class labels
# - feature_names: Uses the features list to label the decision nodes
# - class_names: Labels the leaf nodes as "Not High" or "High"
# - filled=True: Colors the nodes based on the majority class
plot_tree(tree, feature_names=features, class_names=["Not High", "High"], filled=True)
# Add a descriptive title to the visualization
plt.title("Decision Tree for High Scorer Classification")
# Display the plot
plt.show()
In [139]:
df = pd.read_csv("/Users/bcoeur34/Desktop/Python /games.csv")
df.head()
Out[139]:
| GAME_DATE_EST | GAME_ID | GAME_STATUS_TEXT | HOME_TEAM_ID | VISITOR_TEAM_ID | SEASON | TEAM_ID_home | PTS_home | FG_PCT_home | FT_PCT_home | ... | AST_home | REB_home | TEAM_ID_away | PTS_away | FG_PCT_away | FT_PCT_away | FG3_PCT_away | AST_away | REB_away | HOME_TEAM_WINS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022-12-22 | 22200477 | Final | 1610612740 | 1610612759 | 2022 | 1610612740 | 126.0 | 0.484 | 0.926 | ... | 25.0 | 46.0 | 1610612759 | 117.0 | 0.478 | 0.815 | 0.321 | 23.0 | 44.0 | 1 |
| 1 | 2022-12-22 | 22200478 | Final | 1610612762 | 1610612764 | 2022 | 1610612762 | 120.0 | 0.488 | 0.952 | ... | 16.0 | 40.0 | 1610612764 | 112.0 | 0.561 | 0.765 | 0.333 | 20.0 | 37.0 | 1 |
| 2 | 2022-12-21 | 22200466 | Final | 1610612739 | 1610612749 | 2022 | 1610612739 | 114.0 | 0.482 | 0.786 | ... | 22.0 | 37.0 | 1610612749 | 106.0 | 0.470 | 0.682 | 0.433 | 20.0 | 46.0 | 1 |
| 3 | 2022-12-21 | 22200467 | Final | 1610612755 | 1610612765 | 2022 | 1610612755 | 113.0 | 0.441 | 0.909 | ... | 27.0 | 49.0 | 1610612765 | 93.0 | 0.392 | 0.735 | 0.261 | 15.0 | 46.0 | 1 |
| 4 | 2022-12-21 | 22200468 | Final | 1610612737 | 1610612741 | 2022 | 1610612737 | 108.0 | 0.429 | 1.000 | ... | 22.0 | 47.0 | 1610612741 | 110.0 | 0.500 | 0.773 | 0.292 | 20.0 | 47.0 | 0 |
5 rows × 21 columns
In [140]:
## Find overall wins and winning percentages
print(df['HOME_TEAM_WINS'].value_counts())
print(df['HOME_TEAM_WINS'].value_counts(normalize = True))
HOME_TEAM_WINS 1 15645 0 11006 Name: count, dtype: int64 HOME_TEAM_WINS 1 0.587032 0 0.412968 Name: proportion, dtype: float64
In [141]:
home_wins = df['HOME_TEAM_WINS'].value_counts()
total_games = len(df)
win_pct = (home_wins[1] / total_games) * 100
loss_pct = (home_wins[0] / total_games) * 100
print(f"Home teams win {win_pct:.2f}% of games and lose {loss_pct:.2f}%.")
Home teams win 58.70% of games and lose 41.30%.
In [142]:
# Count wins
x = df['HOME_TEAM_WINS'].value_counts()
y = df['HOME_TEAM_WINS'].value_counts()
# Bar positions
labels = ['Games']
positions = np.arange(len(labels))
# Plot bars
plt.bar(positions, x[1], width=0.4, color='blue', label='Home Wins')
plt.bar(positions + 0.4, y[0], width=0.4, color='green', label='Away Wins')
# Add labels and title
plt.ylabel('Number of Wins')
plt.title('Home vs Away Wins')
plt.xticks(positions + 0.2, labels)
plt.legend()
# Show plot
plt.show()
In [143]:
# Encode 'win' as 1 if home team scored more, else 0
df["win"] = (df["PTS_home"] > df["PTS_away"]).astype(int)
# Select only necessary columns and drop rows with any missing values
features = ["FG_PCT_home", "FG_PCT_away", "REB_home", "AST_home", "win"]
df = df[features].dropna()
# Split features and target
x = df[["FG_PCT_home", "FG_PCT_away", "REB_home", "AST_home"]]
y = df["win"]
# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=42)
# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Predict and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Prediction Accuracy: {accuracy:.2f}")
Prediction Accuracy: 0.79
In [144]:
from sklearn.metrics import ConfusionMatrixDisplay
# Display confusion matrix
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)
Out[144]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1483f2150>
In [145]:
# Example of matches up
new_game = pd.DataFrame([{"FG_PCT_home":0.28,
"FG_PCT_away":0.45,
"REB_home":44,
"AST_home":25}])
prediction = model.predict(new_game)
print("predicted win" if prediction[0] == 1 else "predicted loss")
predicted loss
In [146]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# Load the dataset
df = pd.read_csv("/Users/bcoeur34/Desktop/Python /nba_2024.csv")
# Drop rows with missing values in relevant columns
data = df[['PTS', 'AST', 'TRB', 'MP']].dropna()
# Define features and target
X = data[['AST', 'TRB', 'MP']]
y = data['PTS']
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
# Create and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R² Score:", r2)
# Visualize actual vs predicted points
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.6, color='blue')
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--') # Line y = x
plt.xlabel('Actual Points')
plt.ylabel('Predicted Points')
plt.title('Actual vs Predicted Points')
plt.grid(True)
plt.tight_layout()
plt.show()
Mean Squared Error: 9.220250694400955 R² Score: 0.8233828431624677
In [ ]:
In [ ]: